import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, FuncFormatter
from wordcloud import WordCloud, STOPWORDS
# Load the two input tables. Raw strings keep the Windows backslashes literal:
# sequences such as "\P", "\T", "\_" and "\M" are invalid escapes in a normal
# string literal and trigger a warning on recent Python versions.
df_top50 = pd.read_csv(r'D:\PyCharmProject\Top_Youtube_News_Media\_Top50_viewed_video_from_each_channels.csv')
df_stats = pd.read_csv(r'D:\PyCharmProject\Top_Youtube_News_Media\Mainstream_Media_Statistics.csv')
# Preview the first rows of the channel statistics table.
df_stats.head()
| | Unnamed: 0 | channelName | id | subscribers | total views | total videos | created date | description | playlistId |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Fox News | UCXIJgqnII2ZOINSWNOGFThA | 10500000 | 14839464315 | 93767 | 2006-09-19T01:48:52Z | FOX News Channel (FNC) is a 24-hour all-encomp... | UUXIJgqnII2ZOINSWNOGFThA |
| 1 | 1 | Al Jazeera English | UCNye-wNBqNL5ZzHSJj3l8Bg | 10500000 | 3343817389 | 105487 | 2006-11-23T14:12:43Z | #AlJazeeraEnglish, we focus on people and even... | UUNye-wNBqNL5ZzHSJj3l8Bg |
| 2 | 2 | Vox | UCLXo7UDZvByw2ixzpQCufnA | 11400000 | 3260217205 | 1593 | 2014-03-04T20:30:22Z | Vox helps you understand our world.\n\nVox vid... | UULXo7UDZvByw2ixzpQCufnA |
| 3 | 3 | Good Morning Britain | UCq18eeL7D9Vd8DhjMcLh9QQ | 1200000 | 1048952422 | 11212 | 2014-08-18T09:23:49Z | The Good Morning Britain YouTube channel deliv... | UUq18eeL7D9Vd8DhjMcLh9QQ |
| 4 | 4 | E! News | UCjDsbbzHgTrGc4Ff26TJtsA | 1220000 | 515416062 | 12501 | 2005-10-25T11:21:48Z | E! News is your go-to source for breaking cele... | UUjDsbbzHgTrGc4Ff26TJtsA |
# Summarize the channel statistics table: column dtypes and non-null counts.
df_stats.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 43 entries, 0 to 42 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 43 non-null int64 1 channelName 43 non-null object 2 id 43 non-null object 3 subscribers 43 non-null int64 4 total views 43 non-null int64 5 total videos 43 non-null int64 6 created date 43 non-null object 7 description 41 non-null object 8 playlistId 43 non-null object dtypes: int64(4), object(5) memory usage: 3.1+ KB
# List the distinct channel names present in the statistics table.
df_stats['channelName'].unique()
array(['Fox News', 'Al Jazeera English', 'Vox', 'Good Morning Britain',
'E! News', 'ABC News In-depth', 'BBC News', 'ABC News',
'CBC News: The National', 'CNN', 'ITV News', 'On Demand News',
'AFP News Agency', 'CBS Evening News', 'SABC News',
'Forbes Breaking News', 'The Guardian', 'Washington Post',
'euronews', 'CNBC Television', 'NowThis News', 'Daily Mail',
'Voice of America', 'Channel 4 News', 'The Economist', 'CBC News',
'Sky News Australia', 'Global News', 'PBS NewsHour', 'USA TODAY',
'The Telegraph', 'The Sun', 'The New York Times', 'PINAS INSIDER',
'Wall Street Journal', 'DW News', 'CBS News', 'The Young Turks',
'MSNBC', 'Complex', 'Sky News', 'WION', 'NBC News'], dtype=object)
# True if any channel name occurs more than once in the statistics table.
df_stats['channelName'].duplicated().any()
False
# Count missing values per column.
df_stats.isna().sum()
Unnamed: 0 0 channelName 0 id 0 subscribers 0 total views 0 total videos 0 created date 0 description 2 playlistId 0 dtype: int64
# Replace every missing value in the table with the placeholder string 'N/A'.
df_stats=df_stats.fillna('N/A')
# Bar colours: ten entries looked up in matplotlib's tab10 colormap.
cmap = plt.cm.tab10
palette_indices = np.arange(10) % cmap.N
colors = cmap(palette_indices)
def millions(x, pos):
    """Format a tick value as a whole number of millions, e.g. 3000000 -> '3M'.

    Args:
        x: The numeric tick value.
        pos: The tick position; accepted for the formatter callback
            signature but not used.

    Returns:
        The value scaled by 1e-6, rounded to zero decimals, with an 'M' suffix.
    """
    return f'{x * 1e-6:1.0f}M'
# Tick formatter that renders axis values via millions().
formatterm = FuncFormatter(millions)
def billions(x, pos):
    """Format a tick value as a whole number of billions, e.g. 3e9 -> '3B'.

    Args:
        x: The numeric tick value.
        pos: The tick position; accepted for the formatter callback
            signature but not used.

    Returns:
        The value scaled by 1e-9, rounded to zero decimals, with a 'B' suffix.
    """
    return f'{x * 1e-9:1.0f}B'
# Tick formatter that renders axis values via billions().
formatterb = FuncFormatter(billions)
# Get df based on first feature arg
def get_relation_mul(df, top_n, *col):
    """Return the ``top_n`` rows with the largest values in the first column.

    Args:
        df: Source DataFrame.
        top_n: Maximum number of rows to return.
        *col: Column names to keep. The first one is the ranking column; the
            rest are carried along. Names not present in ``df`` are dropped
            by ``DataFrame.filter``.

    Returns:
        A new DataFrame restricted to the requested columns, ordered by the
        ranking column in descending order, with at most ``top_n`` rows.

    Raises:
        ValueError: If no column name is given.
    """
    if not col:
        raise ValueError("get_relation_mul needs at least one column name")
    ranking_column = col[0]
    selected = df.filter(items=list(col))
    # nlargest already returns the rows sorted in descending order, so a
    # separate sort_values pass beforehand is redundant.
    return selected.nlargest(top_n, ranking_column)
# Ten channels with the most total views, plus two ratios derived for them.
df_stats_views = get_relation_mul(
    df_stats, 10, 'total views', 'channelName', 'subscribers', 'total videos'
)
df_stats_views['views/sub'] = df_stats_views['total views'] / df_stats_views['subscribers']
df_stats_views['views/video'] = df_stats_views['total views'] / df_stats_views['total videos']

# One figure with three stacked bar charts sharing the same channels.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 20))
fig.tight_layout(h_pad=15.0)

# (axes, column to plot, y-axis label, title) for each panel.
views_panels = (
    (ax1, 'total views', 'Total Views', 'Top 10 Viewed Channels'),
    (ax2, 'views/sub', 'Views per Sub', 'Views per subscriber'),
    (ax3, 'views/video', 'Views', 'Views per video'),
)
for panel_ax, panel_column, panel_ylabel, panel_title in views_panels:
    panel_ax.bar(df_stats_views['channelName'], df_stats_views[panel_column], color=colors)
    panel_ax.set_xlabel('Channel')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', labelrotation=90)

# Only the total-views panel gets the billions tick formatter.
ax1.yaxis.set_major_formatter(formatterb)
# Ten channels with the most uploaded videos, plus two ratios derived for them.
df_stats_videos = get_relation_mul(
    df_stats, 10, 'total videos', 'channelName', 'subscribers', 'total views'
)
df_stats_videos['videos/sub'] = df_stats_videos['total videos'] / df_stats_videos['subscribers']
df_stats_videos['views/video'] = df_stats_videos['total views'] / df_stats_videos['total videos']

# One figure with three stacked bar charts sharing the same channels.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 20))
fig.tight_layout(h_pad=15.0)

# (axes, column to plot, y-axis label, title) for each panel.
videos_panels = (
    (ax1, 'total videos', 'Total Videos', 'Channels with Most Videos'),
    (ax2, 'videos/sub', 'Videos per Sub', 'Videos per subscriber'),
    (ax3, 'views/video', 'Views/video', 'Views per Video'),
)
for panel_ax, panel_column, panel_ylabel, panel_title in videos_panels:
    panel_ax.bar(df_stats_videos['channelName'], df_stats_videos[panel_column], color=colors)
    panel_ax.set_xlabel('Channel')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', labelrotation=90)
# Ten channels with the most subscribers, plus two ratios derived for them.
df_stats_subs = get_relation_mul(
    df_stats, 10, 'subscribers', 'total views', 'total videos', 'channelName'
)
df_stats_subs['views/sub'] = df_stats_subs['total views'] / df_stats_subs['subscribers']
df_stats_subs['sub/video'] = df_stats_subs['subscribers'] / df_stats_subs['total videos']

# One figure with three stacked bar charts sharing the same channels.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 20))
fig.tight_layout(h_pad=15.0)

# (axes, column to plot, y-axis label, title) for each panel.
subs_panels = (
    (ax1, 'subscribers', 'Total Subscribers', 'Top 10 Subscribed Channels'),
    (ax2, 'views/sub', 'Views per Sub', 'Views per subscriber'),
    (ax3, 'sub/video', 'Subscribers per video', 'Subscribers per video'),
)
for panel_ax, panel_column, panel_ylabel, panel_title in subs_panels:
    panel_ax.bar(df_stats_subs['channelName'], df_stats_subs[panel_column], color=colors)
    panel_ax.set_xlabel('Channel')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', labelrotation=90)

# Only the subscriber-count panel gets the millions tick formatter.
ax1.yaxis.set_major_formatter(formatterm)
# Stop words excluded from every word cloud (the wordcloud package's built-in set).
stopwords = set(STOPWORDS)


def generate_wordcloud(words, title, max_words=25, figsize=10):
    """Build a word cloud from ``words`` and display it in a new square figure.

    Args:
        words: Text handed to ``WordCloud.generate``.
        title: Title drawn above the image.
        max_words: Upper bound on the number of words in the cloud.
        figsize: Side length used for both the figure width and height.
    """
    cloud_builder = WordCloud(
        background_color='white',
        scale=5,
        min_font_size=5,
        max_words=max_words,
        stopwords=stopwords,
    )
    cloud_image = cloud_builder.generate(words)

    square = (figsize, figsize)
    plt.figure(figsize=square)
    plt.axis("off")
    plt.title(title, pad=20, fontsize=20)
    plt.imshow(cloud_image)
    plt.show()
# Join the actual description texts. str(Series) would only yield the pandas
# display repr: values cut off with "...", index labels, and a
# "Name: ..., dtype: ..." footer that would all end up in the cloud.
generate_wordcloud(
    ' '.join(df_stats['description'].dropna().astype(str)),
    "WordCloud of Channel Descriptions",
)
# Summarize the top-50 videos table: column dtypes and non-null counts.
df_top50.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2200 entries, 0 to 2199 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 2200 non-null int64 1 Video Id 2200 non-null object 2 channelTitle 2200 non-null object 3 title 2200 non-null object 4 publishedAt 2200 non-null object 5 categoryId 2200 non-null int64 6 description 2198 non-null object 7 viewCount 2196 non-null float64 8 likeCount 2186 non-null float64 9 commentCount 1964 non-null float64 10 duration 2200 non-null object dtypes: float64(3), int64(2), object(6) memory usage: 189.2+ KB
# Preview the first rows of the top-50 videos table.
df_top50.head()
| | Unnamed: 0 | Video Id | channelTitle | title | publishedAt | categoryId | description | viewCount | likeCount | commentCount | duration |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1400 | spPo80dPUwI | USA TODAY | Heroic mom saves terrified five-year-old from ... | 2022-12-05T17:28:31Z | 25 | A mother came to her five-year-old daughter's ... | 347180823.0 | 10370485.0 | 269937.0 | PT41S |
| 1 | 1900 | NxRGRFKBVLg | On Demand News | Fans Catch CAT Plummeting from Stadium | 2021-09-13T15:02:56Z | 25 | 'Fans Catch CAT Plummeting from Stadium'\n\nFa... | 121244026.0 | 6269691.0 | 95053.0 | PT44S |
| 2 | 1401 | KDo8n_7_2YI | USA TODAY | Brave woman fights off male attacker while alo... | 2023-02-17T13:28:54Z | 25 | A 24-year-old woman successfully fought off a ... | 112886052.0 | 4248640.0 | 92533.0 | PT37S |
| 3 | 1402 | _Gtm2j2gb6w | USA TODAY | Hurricane Fiona devastates Puerto Rico, intens... | 2022-09-22T17:29:38Z | 25 | Hurricane Fiona flooded Puerto Rico with 6 to ... | 97966113.0 | 1504843.0 | 7100.0 | PT42S |
| 4 | 650 | or_MDJnmihc | CBS News | Defendant collapses in court after guilty verdict | 2017-11-14T17:36:13Z | 25 | Diana Lovejoy collapsed in a California courtr... | 75970739.0 | 371733.0 | 79515.0 | PT2M31S |
# Join the full column text. str(Series) on a long column returns only the
# abbreviated display repr (a handful of head/tail rows, each value cut off),
# so the cloud would be built from a tiny, truncated sample.
generate_wordcloud(
    ' '.join(df_top50['description'].dropna().astype(str)),
    "WordCloud of Top 50 videos description from each channel",
    max_words=50,
)
generate_wordcloud(
    ' '.join(df_top50['title'].dropna().astype(str)),
    "WordCloud of video Titles",
    max_words=50,
)
list_sub = df_stats_subs['channelName'].values.tolist()
# WordCloud for top 10 subscribed channels videos
for ch in list_sub:
    df_top50_ch = df_top50.loc[df_top50['channelTitle'] == ch]
    # Join the real description texts; str(Series) would only give the
    # truncated display repr (ellipses, index labels, dtype footer).
    ch_text = ' '.join(df_top50_ch['description'].dropna().astype(str))
    if not ch_text.strip():
        # No videos matched this channel name; WordCloud cannot be built
        # from empty text, so skip instead of failing.
        print(f"No video descriptions found for {ch}; skipping word cloud")
        continue
    generate_wordcloud(ch_text, f"WordCloud of top subbed channels: {ch}", max_words=50)
list_videos = df_stats_videos['channelName'].values.tolist()
# Wordcloud of top 10 channels with most videos
for ch in list_videos:
    df_top50_ch = df_top50.loc[df_top50['channelTitle'] == ch]
    # Join the real description texts; str(Series) would only give the
    # truncated display repr (ellipses, index labels, dtype footer).
    ch_text = ' '.join(df_top50_ch['description'].dropna().astype(str))
    if not ch_text.strip():
        # No videos matched this channel name; WordCloud cannot be built
        # from empty text, so skip instead of failing.
        print(f"No video descriptions found for {ch}; skipping word cloud")
        continue
    generate_wordcloud(ch_text, f"WordCloud of channels with most videos: {ch}", max_words=50)
list_views = df_stats_views['channelName'].values.tolist()
# Wordcloud of top 10 channels with most views
for ch in list_views:
    df_top50_ch = df_top50.loc[df_top50['channelTitle'] == ch]
    # Join the real description texts; str(Series) would only give the
    # truncated display repr (ellipses, index labels, dtype footer).
    ch_text = ' '.join(df_top50_ch['description'].dropna().astype(str))
    if not ch_text.strip():
        # No videos matched this channel name; WordCloud cannot be built
        # from empty text, so skip instead of failing.
        print(f"No video descriptions found for {ch}; skipping word cloud")
        continue
    generate_wordcloud(ch_text, f"WordCloud of channels with most views: {ch}", max_words=50)
# Distinct channel titles found in the top-50 videos table.
df_top50_ch_list=df_top50['channelTitle'].unique().tolist()
# Get top N videos of each channel based on feature
def get_top_channel_feature(feature1, feature2, top_n, df=None, channels=None):
    """Collect ``feature2`` from each channel's ``top_n`` rows ranked by ``feature1``.

    Args:
        feature1: Column used to rank the rows of each channel (descending).
        feature2: Column whose values are collected from the top rows.
        top_n: Number of top rows to take per channel. Channels with fewer
            ranked rows contribute as many as they have.
        df: Table with a 'channelTitle' column. Defaults to the module-level
            ``df_top50``.
        channels: Channel titles to process, in output order. Defaults to the
            module-level ``df_top50_ch_list`` when ``df`` is not given,
            otherwise to the distinct titles found in ``df``.

    Returns:
        A flat list of ``feature2`` values, grouped by channel in the order
        of ``channels``, best-ranked first within each channel.
    """
    if df is None:
        df = df_top50
        if channels is None:
            channels = df_top50_ch_list
    elif channels is None:
        channels = df['channelTitle'].unique().tolist()

    collected = []
    for ch in channels:
        ranked = (
            df.loc[df['channelTitle'] == ch]
            # Rows without a ranking value cannot be "top" rows.
            .dropna(subset=[feature1])
            .sort_values(feature1, ascending=False)
        )
        # head() takes rows 0..top_n-1. The previous iloc[n-1] started at
        # index -1, i.e. it picked the last-ranked row instead of the best.
        collected.extend(ranked[feature2].head(top_n).tolist())
    return collected
# Build each cloud from the description texts themselves. str(list) would feed
# the list's repr to the word cloud: quotes, brackets and escaped newlines
# written out as a literal backslash followed by "n". Non-string entries
# (e.g. missing descriptions) are left out.
description_list_view = get_top_channel_feature('viewCount', 'description', 2)
generate_wordcloud(
    ' '.join(text for text in description_list_view if isinstance(text, str)),
    "WordCloud of most viewed videos for each channel",
    max_words=50,
)
description_list_like = get_top_channel_feature('likeCount', 'description', 2)
generate_wordcloud(
    ' '.join(text for text in description_list_like if isinstance(text, str)),
    "WordCloud of most liked videos for each channel",
    max_words=50,
)